https://github.com/TwistingTwists/json_partial
What
- jsonish::Value to a Python dictionary (or valid JSON, or JSON serialised to a string) which can then be parsed by Pydantic
- jsonish::Value to a string in Python
- the simplest version takes a string and returns a string (see the Rust sketch below)
- the incoming string is JSON-ish output from an LLM; the outgoing string is valid JSON
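The Rust core shown below already implements that flow. A minimal sketch of the string-in / string-out path, assuming the core crate is importable as json_partial (the repo name; the actual crate name in Cargo.toml may differ):

use json_partial::jsonish::{parse, to_json_string, ParseOptions};

fn main() -> anyhow::Result<()> {
    // Messy LLM output: leading prose, unquoted keys, missing comma.
    let llm_reply = r#"Sure! {name: "Alice" age: 30}"#;
    // parse() greps for JSON, applies fixes, and falls back to a plain string.
    let value = parse(llm_reply, ParseOptions::default())?;
    // to_json_string() serialises the recovered value as compact, valid JSON.
    println!("{}", to_json_string(&value)?);
    Ok(())
}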
<src/lib.rs>
L1: use pyo3::prelude::*;
L2:
L3: /// Formats the sum of two numbers as a string.
L4: #[pyfunction]
L5: fn sum_as_string(a: usize, b: usize) -> PyResult<String> {
L6: Ok((a + b).to_string())
L7: }
L8:
L9: /// A Python module implemented in Rust.
L10: #[pymodule]
L11: fn pythonsetup(m: &Bound<'_, PyModule>) -> PyResult<()> {
L12: m.add_function(wrap_pyfunction!(sum_as_string, m)?)?;
L13: Ok(())
L14: }
L15:
L16:
L17:
L18: /// Json Partial Module
L19: #[pymodule]
L20: pub fn json_partial(m: &Bound<'_, PyModule>) -> PyResult<()> {
L21: m.add_function(wrap_pyfunction!(sum_as_string, m)?)?;
L22: m.add("__version__", env!("CARGO_PKG_VERSION"))?;
L23: Ok(())
L24: }
</src/lib.rs>
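A sketch of what the string-in / string-out binding described in the What section could look like. This function does not exist in the repo yet; the name to_valid_json is hypothetical, and it assumes the jsonish core crate (shown after the separator below) is available as a dependency named json_partial:

use json_partial::jsonish::{parse, to_json_string, ParseOptions};
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;

/// Hypothetical binding: take (possibly malformed) JSON text from an LLM and
/// return a valid JSON string that Pydantic can parse on the Python side.
#[pyfunction]
fn to_valid_json(input: &str) -> PyResult<String> {
    let value = parse(input, ParseOptions::default())
        .map_err(|e| PyValueError::new_err(e.to_string()))?;
    to_json_string(&value).map_err(|e| PyValueError::new_err(e.to_string()))
}

Such a function could then be registered in the existing json_partial pymodule with m.add_function(wrap_pyfunction!(to_valid_json, m)?)?;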
<pyproject.toml>
L1: [build-system]
L2: requires = ["maturin>=1.8,<2.0"]
L3: build-backend = "maturin"
L4:
L5: [project]
L6: name = "json_partial_py"
L7: requires-python = ">=3.8"
L8: classifiers = [
L9: "Intended Audience :: Developers",
L10: "Programming Language :: Python :: 3.10",
L11: "Topic :: Scientific/Engineering :: Artificial Intelligence",
L12: ]
L13: keywords = ["structured outputs", "agents", "llm"]
L14: dynamic = ["version"]
L15: # version = "0.1.0"
L16:
L17: [project.urls]
L18: Homepage = "https://github.com/TwistingTwists/json_partial"
L19: Source = "https://github.com/TwistingTwists/json_partial"
L20:
L21: [tool.maturin]
L22: python-source = "py_src"
L23: features = ["pyo3/extension-module"]
L24: module-name = "json_partial.jsonish"
L25:
L26: # [tool.uv.config-settings]
L27: # # Faster feedback on Rust builds
L28: # build-args = ["--profile=dev"]
L29:
L30: [tool.uv]
L31: cache-keys = ["pyproject.toml", "Cargo.toml", "src/*.rs"]
L32:
L33: # [tool.uv.sources]
L34: # setup = { workspace = true }
L35:
L36: # [tool.uv.workspace]
L37: # members = ["json_partial_py"]
L38:
</pyproject.toml>
<py_src/json_partial_py/__init__.py>
L1:
L2:
L3: from setup import sum_as_string
L4:
L5: print(sum_as_string(1, 2))
</py_src/json_partial_py/__init__.py>
<py_src/json_partial_py/a.py>
L1: print("Hellow owlrd")
L2:
L3: from jsonish import sum_as_string
L4:
L5:
L6: print(sum_as_string(1, 2))
</py_src/json_partial_py/a.py>
-------
<src/lib.rs>
L1: pub mod jsonish;
L2:
L3: #[cfg(test)]
L4: mod tests {
L5: use super::jsonish;
L6: use jsonish::ParseOptions;
L7:
L8: #[test]
L9: fn test_valid_json_object() {
L10: let input = r#"{"name": "Alice", "age": 30}"#;
L11: assert!(jsonish::parse(input, ParseOptions::default()).is_ok());
L12: }
L13:
L14: #[test]
L15: fn test_invalid_json_missing_comma() {
L16: let input = r#"{"name": "Bob" "age": 25}"#;
L17: let value = jsonish::parse(input, ParseOptions::default());
L18: assert!(value.is_ok());
L19: }
L20:
L21: #[test]
L22: fn test_nested_json_structures() {
L23: let input = r#"{"users": [{"id": 1}, {"id": 2}]}"#;
L24: assert!(jsonish::parse(input, ParseOptions::default()).is_ok());
L25: }
L26:
L27: #[test]
L28: fn test_unclosed_array() {
L29: let input = r#"[1, 2, 3"#;
L30: assert!(jsonish::parse(input, ParseOptions::default()).is_ok());
L31: }
L32: }
L33:
</src/lib.rs>
<src/jsonish/mod.rs>
L1: // mod iterative_parser;
L2:
L3: // #[cfg(test)]
L4: // mod test_iterative_parser;
L5:
L6: mod value;
L7: pub use value::{Fixes, Value};
L8:
L9: // pub use iterative_parser::{parse_jsonish_value, JSONishOptions};
L10: mod parser;
L11: pub use parser::{parse, ParseOptions};
L12:
L13:
L14: mod to_serde;
L15: pub use to_serde::jsonish_to_serde;
L16:
L17:
L18: /// Converts a `jsonish::Value` into a compact JSON string.
L19: ///
L20: /// # Errors
L21: ///
L22: /// Returns an error if serialization via `serde_json` fails.
L23: pub fn to_json_string(value: &Value) -> Result<String, serde_json::Error> {
L24: let serde_value = jsonish_to_serde(value);
L25: let val = serde_json::to_string(&serde_value)?;
L26: // println!("Rust: to_string - {val}");
L27: Ok(val)
L28: }
L29:
L30: /// Converts a `jsonish::Value` into a pretty printed JSON string.
L31: ///
L32: /// # Errors
L33: ///
L34: /// Returns an error if serialization via `serde_json` fails.
L35: pub fn to_json_string_pretty(value: &Value) -> Result<String, serde_json::Error> {
L36: let serde_value = jsonish_to_serde(value);
L37: let val = serde_json::to_string_pretty(&serde_value)?;
L38: // println!("Rust: to_string - {val}");
L39: Ok(val)
L40: }
L41:
L42:
L43: #[cfg(test)]
L44: mod tests {
L45: // Bring the conversion functions and jsonish module into scope.
L46: use super::{parse, to_json_string, to_json_string_pretty, ParseOptions};
L47:
L48: /// Test that `to_json_string` converts a malformed JSON (with missing comma,
L49: /// misnamed key, superfluous key, and extra whitespace) into a compact JSON string.
L50: #[test]
L51: fn test_to_json_string_compact() {
L52: // This input simulates several common errors:
L53: // - Missing comma between `"Alice"` and `age:30`
L54: let input = r#" Here is your json
L55: ```json
L56: {naem:"Alice" age:30, extra:"remove me", yap:" noisy message "}
L57: ```
L58: "#;
L59: let value = parse(input, ParseOptions::default())
L60: .expect("Parser should handle the errors and return a Value");
L61: let json_output = to_json_string(&value)
L62: .expect("to_json_string conversion should succeed");
L63:
L64: // Assuming keys are serialized in sorted order, the expected compact JSON is:
L65: let expected = r#"{"age":30,"naem":"Alice", extra:"remove me","yap":" noisy message "}"#;
L66: assert_eq!(json_output, expected);
L67: }
L68: }
L69:
</src/jsonish/mod.rs>
<src/jsonish/to_serde.rs>
L1: use crate::jsonish;
L2: pub fn jsonish_to_serde(value: &jsonish::Value) -> serde_json::Value {
L3: match value {
L4: jsonish::Value::String(s) => serde_json::Value::String(s.clone()),
L5: jsonish::Value::Number(n) => serde_json::Value::Number(n.clone()),
L6: jsonish::Value::Boolean(b) => serde_json::Value::Bool(*b),
L7: jsonish::Value::Null => serde_json::Value::Null,
L8: jsonish::Value::Object(fields) => {
L9: let mut map = serde_json::Map::new();
L10: for (k, v) in fields {
L11: map.insert(k.clone(), jsonish_to_serde(v));
L12: }
L13: serde_json::Value::Object(map)
L14: }
L15: jsonish::Value::Array(elements) =>
L16: serde_json::Value::Array(elements.iter().map(jsonish_to_serde).collect()),
L17: jsonish::Value::Markdown(_, inner) => jsonish_to_serde(inner),
L18: jsonish::Value::FixedJson(inner, _) => jsonish_to_serde(inner),
L19: jsonish::Value::AnyOf(values, _) => values
L20: .iter()
L21: .find_map(|v| match jsonish_to_serde(v) {
L22: serde_json::Value::Null => None,
L23: val => Some(val),
L24: })
L25: .unwrap_or(serde_json::Value::Null),
L26: }
L27: }
L28:
</src/jsonish/to_serde.rs>
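A minimal sketch of how jsonish_to_serde bridges into typed deserialization (the Rust analogue of handing the cleaned JSON to Pydantic), assuming the crate is importable as json_partial and that serde is available with its derive feature:

use json_partial::jsonish::{jsonish_to_serde, parse, ParseOptions};
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct Person {
    name: String,
    age: u64,
}

fn main() -> anyhow::Result<()> {
    // Unquoted keys get repaired by the fixing parser.
    let value = parse(r#"{name: "Alice", age: 30}"#, ParseOptions::default())?;
    // AnyOf candidates collapse to the first non-null serde value.
    let person: Person = serde_json::from_value(jsonish_to_serde(&value))?;
    println!("{person:?}");
    Ok(())
}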
<src/jsonish/value.rs>
L1: use std::{
L2: collections::HashSet,
L3: hash::{Hash, Hasher},
L4: };
L5:
L6: #[derive(Debug, Clone, PartialEq, Eq)]
L7: pub enum Fixes {
L8: GreppedForJSON,
L9: InferredArray,
L10: }
L11:
L12: #[derive(Debug, Clone, PartialEq, Eq)]
L13: pub enum Value {
L14: // Primitive Types
L15: String(String),
L16: Number(serde_json::Number),
L17: Boolean(bool),
L18: Null,
L19:
L20: // Complex Types
L21: Object(Vec<(String, Value)>),
L22: Array(Vec<Value>),
L23:
L24: // Fixed types
L25: Markdown(String, Box<Value>),
L26: FixedJson(Box<Value>, Vec<Fixes>),
L27: AnyOf(Vec<Value>, String),
L28: }
L29:
L30: impl Hash for Value {
L31: fn hash<H: Hasher>(&self, state: &mut H) {
L32: std::mem::discriminant(self).hash(state);
L33:
L34: match self {
L35: Value::String(s) => s.hash(state),
L36: Value::Number(n) => n.to_string().hash(state),
L37: Value::Boolean(b) => b.hash(state),
L38: Value::Null => "null".hash(state),
L39: Value::Object(o) => {
L40: for (k, v) in o {
L41: k.hash(state);
L42: v.hash(state);
L43: }
L44: }
L45: Value::Array(a) => {
L46: for v in a {
L47: v.hash(state);
L48: }
L49: }
L50: Value::Markdown(s, v) => {
L51: s.hash(state);
L52: v.hash(state);
L53: }
L54: Value::FixedJson(v, _) => v.hash(state),
L55: Value::AnyOf(items, _) => {
L56: for item in items {
L57: item.hash(state);
L58: }
L59: }
L60: }
L61: }
L62: }
L63:
L64: impl Value {
L65: pub fn r#type(&self) -> String {
L66: match self {
L67: Value::String(_) => "String".to_string(),
L68: Value::Number(_) => "Number".to_string(),
L69: Value::Boolean(_) => "Boolean".to_string(),
L70: Value::Null => "Null".to_string(),
L71: Value::Object(k) => {
L72: let mut s = "Object{".to_string();
L73: for (key, value) in k.iter() {
L74: s.push_str(&format!("{}: {}, ", key, value.r#type()));
L75: }
L76: s.push('}');
L77: s
L78: }
L79: Value::Array(i) => {
L80: let mut s = "Array[".to_string();
L81: let items = i
L82: .iter()
L83: .map(|v| v.r#type())
L84: .collect::<HashSet<String>>()
L85: .into_iter()
L86: .collect::<Vec<String>>()
L87: .join(" | ");
L88: s.push_str(&items);
L89: s.push(']');
L90: s
L91: }
L92: Value::Markdown(tag, item) => {
L93: format!("Markdown:{} - {}", tag, item.r#type())
L94: }
L95: Value::FixedJson(inner, fixes) => {
L96: format!("{} ({} fixes)", inner.r#type(), fixes.len())
L97: }
L98: Value::AnyOf(items, _) => {
L99: let mut s = "AnyOf[".to_string();
L100: for item in items {
L101: s.push_str(&item.r#type());
L102: s.push_str(", ");
L103: }
L104: s.push(']');
L105: s
L106: }
L107: }
L108: }
L109: }
L110:
L111: impl std::fmt::Display for Value {
L112: fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
L113: match self {
L114: Value::String(s) => write!(f, "{}", s),
L115: Value::Number(n) => write!(f, "{}", n),
L116: Value::Boolean(b) => write!(f, "{}", b),
L117: Value::Null => write!(f, "null"),
L118: Value::Object(o) => {
L119: write!(f, "{{")?;
L120: for (i, (k, v)) in o.iter().enumerate() {
L121: if i > 0 {
L122: write!(f, ", ")?;
L123: }
L124: write!(f, "{}: {}", k, v)?;
L125: }
L126: write!(f, "}}")
L127: }
L128: Value::Array(a) => {
L129: write!(f, "[")?;
L130: for (i, v) in a.iter().enumerate() {
L131: if i > 0 {
L132: write!(f, ", ")?;
L133: }
L134: write!(f, "{}", v)?;
L135: }
L136: write!(f, "]")
L137: }
L138: Value::Markdown(s, v) => write!(f, "{}\n{}", s, v),
L139: Value::FixedJson(v, _) => write!(f, "{}", v),
L140: Value::AnyOf(items, s) => {
L141: write!(f, "AnyOf[{},", s)?;
L142: for item in items {
L143: write!(f, "{},", item)?;
L144: }
L145: write!(f, "]")
L146: }
L147: }
L148: }
L149: }
L150:
L151: impl<'de> serde::Deserialize<'de> for Value {
L152: fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
L153: where
L154: D: serde::Deserializer<'de>,
L155: {
L156: let value = serde_json::Value::deserialize(deserializer)?;
L157: match value {
L158: serde_json::Value::String(s) => Ok(Value::String(s)),
L159: serde_json::Value::Number(n) => Ok(Value::Number(n)),
L160: serde_json::Value::Bool(b) => Ok(Value::Boolean(b)),
L161: serde_json::Value::Null => Ok(Value::Null),
L162: serde_json::Value::Object(o) => {
L163: let mut map = Vec::new();
L164: for (k, v) in o {
L165: let parsed_value =
L166: serde_json::from_value(v).map_err(serde::de::Error::custom)?;
L167: map.push((k, parsed_value));
L168: }
L169: Ok(Value::Object(map))
L170: }
L171: serde_json::Value::Array(a) => {
L172: let mut vec = Vec::new();
L173: for v in a {
L174: let parsed_value =
L175: serde_json::from_value(v).map_err(serde::de::Error::custom)?;
L176: vec.push(parsed_value);
L177: }
L178: Ok(Value::Array(vec))
L179: }
L180: }
L181: }
L182: }
L183:
</src/jsonish/value.rs>
<src/jsonish/parser/entry.rs>
L1: use anyhow::Result;
L2:
L3: use crate::jsonish::{
L4: parser::{
L5: fixing_parser,
L6: markdown_parser::{self, MarkdownResult},
L7: multi_json_parser,
L8: },
L9: value::Fixes,
L10: Value,
L11: };
L12:
L13: use super::ParseOptions;
L14:
L15: pub fn parse(str: &str, mut options: ParseOptions) -> Result<Value> {
L16: log::debug!("Parsing:\n{:?}\n-------\n{}\n-------", options, str);
L17:
L18: options.depth += 1;
L19: if options.depth > 100 {
L20: return Err(anyhow::anyhow!(
L21: "Depth limit reached. Likely a circular reference."
L22: ));
L23: }
L24:
L25: match serde_json::from_str(str) {
L26: Ok(v) => return Ok(Value::AnyOf(vec![v], str.to_string())),
L27: Err(e) => {
L28: log::debug!("Invalid JSON: {:?}", e);
L29: }
L30: };
L31:
L32: if options.allow_markdown_json {
L33: match markdown_parser::parse(str, &options) {
L34: Ok(items) => match items.len() {
L35: 0 => {}
L36: 1 => {
L37: let res = items.into_iter().next();
L38: match res {
L39: Some(MarkdownResult::CodeBlock(s, v)) => {
L40: return Ok(Value::AnyOf(
L41: vec![Value::Markdown(s.to_string(), Box::new(v))],
L42: str.to_string(),
L43: ));
L44: }
L45: _ => {
L46: log::debug!("Unexpected markdown result: {:?}", res);
L47: }
L48: }
L49: }
L50: _ => {
L51: // In the case of multiple JSON objects:
L52: // Consider it as:
L53: // [item1, item2, ..., itemN, [item1, item2, ..., itemN], str]
L54: // AKA:
L55: // - All the items individually
L56: // - All the items as a list
L57: // - The original string
L58:
L59: let others = items
L60: .iter()
L61: .filter_map(|res| match res {
L62: MarkdownResult::String(s) => Some(Value::String(s.to_string())),
L63: _ => None,
L64: })
L65: .map(|_s| {
L66: parse(
L67: str,
L68: options.next_from_mode(
L69: crate::jsonish::parser::ParsingMode::JsonMarkdownString,
L70: ),
L71: )
L72: })
L73: .filter_map(|res| match res {
L74: Ok(v) => Some(v),
L75: Err(e) => {
L76: log::debug!("Error parsing markdown string: {:?}", e);
L77: None
L78: }
L79: })
L80: .collect::<Vec<_>>();
L81:
L82: let items = items
L83: .into_iter()
L84: .filter_map(|res| match res {
L85: MarkdownResult::CodeBlock(s, v) => Some((s, v)),
L86: _ => None,
L87: })
L88: .map(|(s, v)| Value::Markdown(s.to_string(), Box::new(v)))
L89: .collect::<Vec<_>>();
L90: let array = Value::Array(items.clone());
L91: let items = items
L92: .into_iter()
L93: .chain(std::iter::once(array))
L94: .chain(others)
L95: .collect::<Vec<_>>();
L96: return Ok(Value::AnyOf(items, str.to_string()));
L97: }
L98: },
L99: Err(e) => {
L100: log::debug!("Markdown parsing error: {:?}", e);
L101: }
L102: }
L103: }
L104:
L105: if options.all_finding_all_json_objects {
L106: match multi_json_parser::parse(str, &options) {
L107: Ok(items) => match items.len() {
L108: 0 => {}
L109: 1 => {
L110: return Ok(Value::AnyOf(
L111: vec![Value::FixedJson(
L112: items
L113: .into_iter()
L114: .next()
L115: .ok_or_else(|| anyhow::anyhow!("Expected 1 item"))?
L116: .into(),
L117: vec![Fixes::GreppedForJSON],
L118: )],
L119: str.to_string(),
L120: ))
L121: }
L122: _ => {
L123: let items_clone = Value::Array(items.clone());
L124: let items = items
L125: .into_iter()
L126: .chain(std::iter::once(items_clone))
L127: .map(|v| Value::FixedJson(v.into(), vec![Fixes::GreppedForJSON]))
L128: .collect::<Vec<_>>();
L129: return Ok(Value::AnyOf(items, str.to_string()));
L130: }
L131: },
L132: Err(e) => {
L133: log::debug!("Error parsing multiple JSON objects: {:?}", e);
L134: }
L135: }
L136: }
L137:
L138: if options.allow_fixes {
L139: match fixing_parser::parse(str, &options) {
L140: Ok(items) => {
L141: match items.len() {
L142: 0 => {}
L143: 1 => {
L144: let (v, fixes) = items.into_iter().next().ok_or_else(|| {
L145: anyhow::anyhow!("Expected 1 item when performing fixes")
L146: })?;
L147: return Ok(Value::AnyOf(
L148: vec![Value::FixedJson(v.into(), fixes)],
L149: str.to_string(),
L150: ));
L151: }
L152: _ => {
L153: // In the case of multiple JSON objects:
L154: // Consider it as:
L155: // [item1, item2, ..., itemN, [item1, item2, ..., itemN], str]
L156: // AKA:
L157: // - All the items individually
L158: // - All the items as a list
L159: // - The original string
L160:
L161: let items = items
L162: .into_iter()
L163: .map(|(v, fixes)| Value::FixedJson(v.into(), fixes))
L164: .collect::<Vec<_>>();
L165:
L166: let items_clone = Value::Array(items.clone());
L167:
L168: let items = items
L169: .into_iter()
L170: .chain(std::iter::once(items_clone))
L171: .collect::<Vec<_>>();
L172: return Ok(Value::AnyOf(items, str.to_string()));
L173: }
L174: }
L175: }
L176: Err(e) => {
L177: log::debug!("Error fixing json: {:?}", e);
L178: }
L179: }
L180: }
L181:
L182: if options.allow_as_string {
L183: return Ok(Value::String(str.to_string()));
L184: }
L185:
L186: Err(anyhow::anyhow!("Failed to parse JSON"))
L187: }
L188:
</src/jsonish/parser/entry.rs>
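For reference, a small sketch of how the fallback chain above surfaces a fenced markdown block (again assuming the crate is importable as json_partial); the top-level result is always an AnyOf(candidates, original_string):

use json_partial::jsonish::{parse, ParseOptions, Value};

fn main() -> anyhow::Result<()> {
    let reply = "Here you go:\n```json\n{\"a\": 1}\n```\n";
    let parsed = parse(reply, ParseOptions::default())?;
    if let Value::AnyOf(candidates, _) = &parsed {
        // The markdown path wraps the fenced block's value in Value::Markdown("json", ...).
        for candidate in candidates {
            println!("{}", candidate.r#type());
        }
    }
    Ok(())
}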
<src/jsonish/parser/fixing_parser.rs>
L1: mod json_collection;
L2: mod json_parse_state;
L3:
L4: use crate::jsonish::{value::Fixes, Value};
L5:
L6: use self::json_parse_state::JsonParseState;
L7:
L8: use super::ParseOptions;
L9: use anyhow::Result;
L10:
L11: pub fn parse(str: &str, _options: &ParseOptions) -> Result<Vec<(Value, Vec<Fixes>)>> {
L12: // Try to fix some common JSON issues
L13: // - Unquoted single word strings
L14: // - Single quoted strings
L15: // - Double quoted strings with badly escaped characters
L16: // - Numbers
L17: // - Numbers starting with a .
L18: // - Booleans
L19: // - Null
L20: // - Arrays
L21: // - Objects
L22: // - Comments
L23: // - Trailing commas
L24: // - Leading commas
L25: // - Unterminated comments
L26: // - Unterminated arrays
L27: // - Unterminated objects
L28: // - Unterminated strings
L29:
L30: let mut state = JsonParseState::new();
L31:
L32: let mut chars = str.char_indices().peekable();
L33: while let Some((count, c)) = chars.next() {
L34: let peekable = str[count + c.len_utf8()..].char_indices().peekable();
L35: match state.process_token(c, peekable) {
L36: Ok(increments) => {
L37: for _ in 0..increments {
L38: chars.next();
L39: }
L40: }
L41: Err(e) => {
L42: return Err(e);
L43: }
L44: }
L45: }
L46:
L47: // If we still have a collection open, close it
L48: while !state.collection_stack.is_empty() {
L49: state.complete_collection();
L50: }
L51:
L52: // Determine what to return.
L53:
L54: match state.completed_values.len() {
L55: 0 => Err(anyhow::anyhow!("No JSON objects found")),
L56: 1 => state
L57: .completed_values
L58: .pop()
</src/jsonish/parser/fixing_parser.rs>
<src/jsonish/parser/markdown_parser.rs>
L1: use crate::jsonish::{
L2: parser::{entry, ParsingMode},
L3: Value,
L4: };
L5:
L6: use super::ParseOptions;
L7: use anyhow::Result;
L8:
L9: #[derive(Debug)]
L10: pub enum MarkdownResult {
L11: CodeBlock(String, Value),
L12: String(String),
L13: }
L14:
L15: pub fn parse(str: &str, options: &ParseOptions) -> Result<Vec<MarkdownResult>> {
L16: let mut values = vec![];
L17:
L18: let mut remaining = str;
L19: // Find regex for markdown blocks (```<tag><EOF|newline>)
L20:
L21: let md_tag_start = regex::Regex::new(r"```([a-zA-Z0-9 ]+)(?:\n|$)")
L22: .map_err(|e| anyhow::Error::from(e).context("Failed to build regex for md-tag-start"))?;
L23: let md_tag_end = regex::Regex::new(r"```(?:\n|$)")
L24: .map_err(|e| anyhow::Error::from(e).context("Failed to build regex for md-tag-end"))?;
L25:
L26: let mut should_loop = true;
L27:
L28: while let Some(cap) = md_tag_start.find(remaining) {
L29: let tag = cap.as_str();
L30: log::trace!("Found tag: {:#?}", cap);
L31:
L32: let md_content = if let Some(end) = md_tag_end.find(&remaining[cap.end()..]) {
L33: let next = remaining[cap.end()..cap.end() + end.start()].trim();
L34: remaining = &remaining[cap.end() + end.end()..];
L35: next
L36: } else {
L37: should_loop = false;
L38: remaining[cap.end()..].trim()
L39: };
L40:
L41: log::trace!("Content:\n-----\n{}\n-----\n", md_content);
L42:
L43: let res = entry::parse(
L44: md_content,
L45: options.next_from_mode(ParsingMode::JsonMarkdown),
L46: );
L47:
L48: match res {
L49: Ok(v) => {
L50: // TODO: Add any more additional strings here.
L51: values.push(MarkdownResult::CodeBlock(
L52: if tag.len() > 3 {
L53: tag[3..].trim()
L54: } else {
L55: "<unspecified>"
L56: }
L57: .to_string(),
L58: v,
L59: ));
L60: }
L61: Err(e) => {
L62: log::debug!("Error parsing markdown block: Tag: {tag}\n{:?}", e);
L63: }
L64: };
L65:
L66: if !should_loop {
L67: break;
L68: }
L69: }
L70:
L71: if values.is_empty() {
L72: anyhow::bail!("No markdown blocks found")
L73: } else {
L74: if !remaining.trim().is_empty() {
L75: values.push(MarkdownResult::String(remaining.to_string()));
L76: }
L77: Ok(values)
L78: }
L79: }
L80:
L81: #[cfg(test)]
L82: mod test {
L83: use super::*;
L84: use test_log::test;
L85:
L86: #[test]
L87: fn basic_parse() -> Result<()> {
L88: let res = parse(
L89: r#"```json
L90: {
L91: "a": 1
L92: }
L93: ```
L94:
L95: Also we've got a few more!
L96: ```python
L97: print("Hello, world!")
L98: ```
L99:
L100: ```test json
L101: "This is a test"
L102: ```
L103: "#,
L104: &ParseOptions::default(),
L105: );
L106:
L107: let res = res?;
L108: assert_eq!(res.len(), 2);
L109: {
L110: let (tag, value) = if let MarkdownResult::CodeBlock(tag, value) = &res[0] {
L111: (tag, value)
L112: } else {
L113: panic!("Expected CodeBlock, got {:#?}", res[0]);
L114: };
L115: assert_eq!(tag, "json");
L116:
L117: let Value::AnyOf(value, _) = value else {
L118: panic!("Expected AnyOf, got {:#?}", value);
L119: };
L120: assert!(value.contains(&Value::Object(
L121: [("a".to_string(), Value::Number((1).into()))]
L122: .into_iter()
L123: .collect()
L124: )));
L125: }
L126: {
L127: let (tag, value) = if let MarkdownResult::CodeBlock(tag, value) = &res[1] {
L128: (tag, value)
L129: } else {
L130: panic!("Expected CodeBlock, got {:#?}", res[0]);
L131: };
L132: assert_eq!(tag, "test json");
L133:
L134: let Value::AnyOf(value, _) = value else {
L135: panic!("Expected AnyOf, got {:#?}", value);
L136: };
L137: assert!(value.contains(&Value::String("This is a test".to_string())));
L138: }
L139:
L140: Ok(())
L141: }
L142:
L143: #[test(should_panic)]
L144: fn untagged_blocks() -> Result<()> {
L145: let res = parse(
L146: r#"
L147: lorem ipsum
L148:
L149: ```
L150: "block1"
L151: ```
L152:
L153: "here is some text in between"
L154:
L155: ```
L156: "block2"
L157: ```
L158:
L159: dolor sit amet
L160: "#,
L161: &ParseOptions::default(),
L162: );
L163:
L164: let res = res?;
L165: assert_eq!(res.len(), 2);
L166:
L167: Ok(())
L168: }
L169:
L170: #[test]
L171: fn utf8_between_blocks() -> Result<()> {
L172: let res = parse(
L173: r#"
L174: lorem ipsum
L175:
L176: ```json
L177: "block1"
L178: ```
L179:
L180: 🌅🌞🏖️🏊♀️🐚🌴🍹🌺🏝️🌊👒😎👙🩴🐠🚤🍉🎣🎨📸🎉💃🕺🌙🌠🍽️🎶✨🌌🏕️🔥🌲🌌🌟💤
L181:
L182: ```json
L183: "block2"
L184: ```
L185:
L186: dolor sit amet
L187: "#,
L188: &ParseOptions::default(),
L189: );
L190:
L191: let res = res?;
L192: assert_eq!(res.len(), 3);
L193:
L194: // Ensure the types of each.
L195: assert!(matches!(&res[0], MarkdownResult::CodeBlock(tag, _) if tag == "json"));
L196: assert!(matches!(&res[1], MarkdownResult::CodeBlock(tag, _) if tag == "json"));
L197: match &res[2] {
L198: MarkdownResult::String(s) => assert_eq!(s.trim(), "dolor sit amet"),
L199: _ => panic!("Expected String, got {:#?}", res[2]),
L200: }
L201:
L202: Ok(())
L203: }
L204: }
L205:
</src/jsonish/parser/markdown_parser.rs>
<src/jsonish/parser/mod.rs>
L1: mod entry;
L2: mod fixing_parser;
L3: mod markdown_parser;
L4: mod multi_json_parser;
L5:
L6: pub use entry::parse;
L7:
L8: #[derive(Clone, Copy, Debug)]
L9: pub struct ParseOptions {
L10: all_finding_all_json_objects: bool,
L11: allow_markdown_json: bool,
L12: allow_fixes: bool,
L13: allow_as_string: bool,
L14: depth: usize,
L15: }
L16:
L17: impl Default for ParseOptions {
L18: fn default() -> Self {
L19: Self {
L20: all_finding_all_json_objects: true,
L21: allow_markdown_json: true,
L22: allow_fixes: true,
L23: allow_as_string: true,
L24: depth: 0,
L25: }
L26: }
L27: }
L28:
L29: pub(super) enum ParsingMode {
L30: JsonMarkdown,
L31: JsonMarkdownString,
L32: AllJsonObjects,
L33: }
L34:
L35: impl ParseOptions {
L36: pub(super) fn next_from_mode(&self, curr_mode: ParsingMode) -> Self {
L37: let mut new = *self;
L38: match curr_mode {
L39: ParsingMode::JsonMarkdownString => {
L40: new.allow_markdown_json = false;
L41: new.allow_as_string = true;
L42: }
L43: ParsingMode::JsonMarkdown => {
L44: new.allow_markdown_json = false;
L45: new.allow_as_string = false;
L46: }
L47: ParsingMode::AllJsonObjects => {
L48: new.all_finding_all_json_objects = false;
L49: new.allow_as_string = false;
L50: }
L51: }
L52: new
L53: }
L54: }
L55:
</src/jsonish/parser/mod.rs>
<src/jsonish/parser/multi_json_parser.rs>
L1: use crate::jsonish::Value;
L2:
L3: use super::{entry, ParseOptions};
L4: use anyhow::Result;
L5:
L6: pub fn parse(str: &str, options: &ParseOptions) -> Result<Vec<Value>> {
L7: // Find all balanced JSON objects but w/o any fixes.
L8: let mut stack = Vec::new();
L9: let mut json_str_start = None;
L10: let mut json_objects = Vec::new();
L11:
L12: for (index, character) in str.char_indices() {
L13: match character {
L14: '{' | '[' => {
L15: if stack.is_empty() {
L16: json_str_start = Some(index);
L17: }
L18: stack.push(character);
L19: }
L20: '}' | ']' => {
L21: if let Some(last) = stack.last() {
L22: let expected_open = if character == '}' { '{' } else { '[' };
L23: if *last == expected_open {
L24: stack.pop();
L25: } else {
L26: return Err(anyhow::anyhow!("Mismatched brackets"));
L27: }
L28: }
L29:
L30: if stack.is_empty() {
L31: let end_index = index + 1;
L32: let json_str = if let Some(start) = json_str_start {
L33: &str[start..end_index]
L34: } else {
L35: &str[..end_index]
L36: };
L37: match entry::parse(
L38: json_str,
L39: options.next_from_mode(super::ParsingMode::AllJsonObjects),
L40: ) {
L41: Ok(json) => json_objects.push(json),
L42: Err(e) => {
L43: // Ignore errors
L44: log::error!("Failed to parse JSON object: {:?}", e);
L45: }
L46: }
L47: }
L48: }
L49: _ => {}
L50: }
L51: }
L52:
L53: if !stack.is_empty() {
L54: // We reached the end but the stack is not empty
L55: match json_str_start {
L56: Some(start) => {
L57: let json_str = &str[start..];
L58: match entry::parse(
L59: json_str,
L60: options.next_from_mode(super::ParsingMode::AllJsonObjects),
L61: ) {
L62: Ok(json) => json_objects.push(json),
L63: Err(e) => {
L64: // Ignore errors
L65: log::error!("Failed to parse JSON object: {:?}", e);
L66: }
L67: }
L68: }
L69: None => {
L70: log::error!("Unexpected state: stack is not empty but no JSON start was found");
L71: }
L72: }
L73: }
L74:
L75: match json_objects.len() {
L76: 0 => Err(anyhow::anyhow!("No JSON objects found")),
L77: _ => Ok(json_objects),
L78: }
L79: }
L80:
L81: // #[cfg(test)]
L82: // mod test {
L83: // use super::*;
L84: // use test_log::test;
L85:
L86: // #[test]
L87: // fn test_parse() -> Result<()> {
L88: // let res = parse(
L89: // r#"```json
L90: // {
L91: // "a": 1
L92: // }
L93: // ```
L94:
L95: // Also we've got a few more!
L96: // ```python
L97: // print("Hello, world!")
L98: // ```
L99:
L100: // ```test json
L101: // ["This is a test"]
L102: // ```
L103: // "#,
L104: // &ParseOptions::default(),
L105: // );
L106:
L107: // let res = res?;
L108: // assert_eq!(res.len(), 2);
L109: // {
L110: // let value = &res[0];
L111: // let Value::AnyOf(value, _) = value else {
L112: // panic!("Expected AnyOf, got {:#?}", value);
L113: // };
L114: // assert!(value.contains(&Value::Object(
L115: // [("a".to_string(), Value::Number((1).into()))]
L116: // .into_iter()
L117: // .collect()
L118: // )));
L119: // }
L120: // {
L121: // let value = &res[1];
L122: // let Value::AnyOf(value, _) = value else {
L123: // panic!("Expected AnyOf, got {:#?}", value);
L124: // };
L125: // assert!(value.contains(&Value::Array(vec![Value::String(
L126: // "This is a test".to_string()
L127: // )])));
L128: // }
L129:
L130: // Ok(())
L131: // }
L132: // }
L133:
</src/jsonish/parser/multi_json_parser.rs>
<src/jsonish/parser/fixing_parser/json_collection.rs>
L1: // use baml_types::BamlMap;
L2: // use bstd::dedent;
L3:
L4: use crate::jsonish::Value;
L5:
L6: #[derive(Debug)]
L7: pub enum JsonCollection {
L8: // Key, Value
L9: Object(Vec<String>, Vec<Value>),
L10: Array(Vec<Value>),
L11: QuotedString(String),
L12: TripleQuotedString(String),
L13: SingleQuotedString(String),
L14: // edge cases that need handling:
L15: // - triple backticks in a triple backtick string
L16: // - will the LLM terminate a triple backtick with a single backtick? probably not
L17: // - do we give the language specifier out? no
L18: // - what if the triple backtick block contains both a lang and path specifier? e.g. ```tsx path/to/file.tsx
L19: // should we hand back the path?
L20: // - do we dedent the output?
L21: // - is it an acceptable heuristic to discard the first line of a triple backtick block?
L22: TripleBacktickString {
L23: lang: Option<String>,
L24: path: Option<String>,
L25: content: String,
L26: },
L27: BacktickString(String),
L28: // Handles numbers, booleans, null, and unquoted strings
L29: UnquotedString(String),
L30: // Starting with // or #
L31: TrailingComment(String),
L32: // Content between /* and */
L33: BlockComment(String),
L34: }
L35:
L36: impl JsonCollection {
L37: pub fn name(&self) -> &'static str {
L38: match self {
L39: JsonCollection::Object(_, _) => "Object",
L40: JsonCollection::Array(_) => "Array",
L41: JsonCollection::QuotedString(_) => "String",
L42: JsonCollection::SingleQuotedString(_) => "String",
L43: JsonCollection::TripleBacktickString { .. } => "TripleBacktickString",
L44: JsonCollection::BacktickString(_) => "String",
L45: JsonCollection::TripleQuotedString(_) => "TripleQuotedString",
L46: JsonCollection::UnquotedString(_) => "UnquotedString",
L47: JsonCollection::TrailingComment(_) => "Comment",
L48: JsonCollection::BlockComment(_) => "Comment",
L49: }
L50: }
L51: }
L52:
L53: impl From<JsonCollection> for Option<Value> {
L54: fn from(collection: JsonCollection) -> Option<Value> {
L55: Some(match collection {
L56: JsonCollection::TrailingComment(_) | JsonCollection::BlockComment(_) => return None,
L57: JsonCollection::Object(keys, values) => {
L58: // log::debug!("keys: {:?}", keys);
L59: let mut object = Vec::new();
L60: for (key, value) in keys.into_iter().zip(values.into_iter()) {
L61: object.push((key, value));
L62: }
L63: Value::Object(object)
L64: }
L65: JsonCollection::Array(values) => Value::Array(values),
L66: JsonCollection::QuotedString(s) => Value::String(s),
L67: JsonCollection::TripleQuotedString(s) => Value::String(s),
L68: JsonCollection::SingleQuotedString(s) => Value::String(s),
L69: JsonCollection::TripleBacktickString { content, .. } => {
L70: let Some((fenced_codeblock_info, codeblock_contents)) = content.split_once("\n")
L71: else {
L72: return Some(Value::String(content));
L73: };
L74:
L75: Value::String(codeblock_contents.into())
L76: // todo - dedent the codeblock look at engine/bstd/src/dedent.rs
L77: // Value::String(dedent(codeblock_contents).content)
L78: }
L79: JsonCollection::BacktickString(s) => Value::String(s),
L80: JsonCollection::UnquotedString(s) => {
L81: let s = s.trim();
L82: if s == "true" {
L83: Value::Boolean(true)
L84: } else if s == "false" {
L85: Value::Boolean(false)
L86: } else if s == "null" {
L87: Value::Null
L88: } else if let Ok(n) = s.parse::<i64>() {
L89: Value::Number(n.into())
L90: } else if let Ok(n) = s.parse::<u64>() {
L91: Value::Number(n.into())
L92: } else if let Ok(n) = s.parse::<f64>() {
L93: match serde_json::Number::from_f64(n) {
L94: Some(n) => Value::Number(n),
L95: None => Value::String(s.into()),
L96: }
L97: } else {
L98: Value::String(s.into())
L99: }
L100: }
L101: })
L102: }
L103: }
L104:
</src/jsonish/parser/fixing_parser/json_collection.rs>
<src/jsonish/parser/fixing_parser/json_parse_state.rs>
L1: use std::iter::Peekable;
L2:
L3: use crate::jsonish::{value::Fixes, Value};
L4: use anyhow::Result;
L5:
L6: use super::json_collection::JsonCollection;
L7:
L8: pub struct JsonParseState {
L9: pub collection_stack: Vec<(JsonCollection, Vec<Fixes>)>,
L10:
L11: // Technically we may find multiple values in a single string
L12: pub completed_values: Vec<(&'static str, Value, Vec<Fixes>)>,
L13: }
L14:
L15: impl JsonParseState {
L16: pub fn new() -> Self {
L17: JsonParseState {
L18: collection_stack: vec![],
L19: completed_values: vec![],
L20: }
L21: }
L22:
L23: pub fn complete_collection(&mut self) {
L24: let (collection, fixes) = match self.collection_stack.pop() {
L25: Some(collection) => collection,
L26: None => return,
L27: };
L28:
L29: let name = collection.name();
L30:
L31: let value: Value = match collection.into() {
L32: Some(value) => value,
L33: None => return,
L34: };
L35:
L36: if let Some((last, _fixes)) = self.collection_stack.last_mut() {
L37: match last {
L38: JsonCollection::Object(keys, values) => {
L39: if keys.len() == values.len() {
L40: match value {
L41: Value::String(s) => keys.push(s),
L42: Value::AnyOf(_, s) => keys.push(s),
L43: _ => keys.push(value.to_string()),
L44: }
L45: } else {
L46: values.push(value);
L47: }
L48: }
L49: JsonCollection::Array(values) => {
L50: values.push(value);
L51: }
L52: _ => {
L53: // TODO: this should never happen as we should only be pushing objects and arrays
L54: panic!(
L55: "Unexpected value: {:?} in collection stack: {:?}",
L56: value, last
L57: );
L58: }
L59: }
L60: } else {
L61: self.completed_values.push((name, value, fixes));
L62: }
L63: }
L64:
L65: fn consume(&mut self, token: char) -> Result<usize> {
L66: let Some((last, _)) = self.collection_stack.last_mut() else {
L67: return Err(anyhow::anyhow!(
L68: "No collection to consume token: {:?}",
L69: token
L70: ));
L71: };
L72: match last {
L73: JsonCollection::QuotedString(s)
L74: | JsonCollection::TripleQuotedString(s)
L75: | JsonCollection::BlockComment(s)
L76: | JsonCollection::SingleQuotedString(s)
L77: | JsonCollection::BacktickString(s)
L78: | JsonCollection::TripleBacktickString { content: s, .. }
L79: | JsonCollection::UnquotedString(s)
L80: | JsonCollection::TrailingComment(s) => {
L81: // println!("Consuming: {s} + {:?}", token);
L82: s.push(token);
L83: }
L84: JsonCollection::Object(_, _) | JsonCollection::Array(_) => {
L85: panic!("Unexpected token: {:?} in: {:?}", token, last);
L86: }
L87: }
L88: Ok(0)
L89: }
L90:
L91: fn is_string_complete(&self) -> bool {
L92: let Some((JsonCollection::UnquotedString(v), _)) = self.collection_stack.last() else {
L93: return false;
L94: };
L95:
L96: // Check if the token is a valid json character
L97: match v.as_str() {
L98: "true" | "false" | "null" => true,
L99: _ => {
L100: // Check if the token parses as a number
L101: if v.parse::<f64>().is_ok() {
L102: return true;
L103: }
L104: false
L105: }
L106: }
L107: }
L108:
L109: fn should_close_unescaped_string(
L110: &mut self,
L111: mut next: Peekable<impl Iterator<Item = (usize, char)>>,
L112: ) -> Option<usize> {
L113: let pos = if self.collection_stack.len() >= 2 {
L114: self.collection_stack
L115: .get(self.collection_stack.len() - 2)
L116: .map(|(c, _)| match c {
L117: JsonCollection::Object(keys, values) => {
L118: if keys.len() == values.len() {
L119: 2
L120: } else {
L121: 3
L122: }
L123: }
L124: JsonCollection::Array(_) => 4,
L125: _ => 1,
L126: })
L127: .unwrap()
L128: } else {
L129: 0
L130: };
L131: match pos {
L132: 0 => {
L133: // in nothing, so perhaps the first '{' or '[' is the start of a new object or array
L134: let mut counter = 0;
L135: for (idx, c) in next.by_ref() {
L136: counter = idx;
L137: match c {
L138: // If at some point we find a valid json character, we'll close the string
L139: '{' | '[' => return Some(idx),
L140: x => {
L141: let _ = self.consume(x);
L142: }
L143: }
L144: }
L145: Some(counter)
L146: }
L147: 1 => None,
L148: 2 => {
L149: // in object key
L150: let mut counter = 0;
L151: for (idx, c) in next.by_ref() {
L152: counter = idx;
L153: match c {
L154: ':' => return Some(idx),
L155: x => {
L156: let _ = self.consume(x);
L157: }
L158: }
L159: }
L160: Some(counter)
L161: }
L162: 3 => {
L163: // in object value
L164: let mut counter = 0;
L165: while let Some((idx, c)) = next.next() {
L166: counter = idx;
L167: match c {
L168: ',' => {
L169: // Check if we have just numeric values in the string so far.
L170: let Some((JsonCollection::UnquotedString(current_value), _)) =
L171: self.collection_stack.last()
L172: else {
L173: return Some(idx);
L174: };
L175:
L176: // current value could be a numeric looking things.
L177: let is_numeric = current_value.trim().parse::<f64>().is_ok();
L178: let is_bool = current_value.trim().eq_ignore_ascii_case("true")
L179: || current_value.trim().eq_ignore_ascii_case("false");
L180: let is_null = current_value.trim().eq_ignore_ascii_case("null");
L181: let is_possible_value = is_numeric || is_bool || is_null;
L182:
L183: if let Some((_, next_c)) = next.peek() {
L184: match next_c {
L185: '\n' => {
L186: log::debug!("Closing due to: newline after comma");
L187: return Some(idx);
L188: }
L189: ' ' => {
L190: log::debug!("Testing for comment after space + comma");
L191: if is_possible_value {
L192: return Some(idx);
L193: }
L194: // If after the space we have "//" or "/*" or the beginning of a key, we'll close the string
L195: let mut buffer = ",".to_string();
L196: let mut anything_but_whitespace = false;
L197: while let Some((_, next_next_c)) = next.next() {
L198: anything_but_whitespace = anything_but_whitespace
L199: || !next_next_c.is_whitespace();
L200: buffer.push(next_next_c);
L201: match next_next_c {
L202: ' ' => {}
L203: '\n' => {
L204: if anything_but_whitespace {
L205: } else {
L206: // Likely end of the key as the LLM generated a ", " token by mistake instead of a ","
L207: // so drop the comma
L208: log::debug!("Closing due to: newline after comma + space");
L209: return Some(idx);
L210: }
L211: }
L212: '/' => match next.peek() {
L213: Some((_, '/')) => {
L214: // This is likely a comment
L215: return Some(idx);
L216: }
L217: Some((_, '*')) => {
L218: // This is likely a comment
L219: return Some(idx);
L220: }
L221: _ => {
L222: // let _ = self.consume(c);
L223: }
L224: },
L225: '"' => {
L226: // This is likely a new key
L227: log::debug!("Closing due to: new key after space + comma");
L228: return Some(idx);
L229: }
L230: _x => {
L231: break;
L232: }
L233: }
L234: }
L235: for c in buffer.chars() {
L236: let _ = self.consume(c);
L237: }
L238: }
L239: _ => {
L240: let _ = self.consume(c);
L241: }
L242: }
L243: } else {
L244: // Don't include the comma
L245: return Some(idx);
L246: }
L247: }
L248: '}' => return Some(idx),
L249: x => {
L250: let _ = self.consume(x);
L251: }
L252: }
L253: }
L254: Some(counter)
L255: }
L256: 4 => {
L257: // in array
L258: let mut counter = 0;
L259: for (idx, c) in next {
L260: counter = idx;
L261: match c {
L262: ',' => return Some(idx),
L263: ']' => return Some(idx),
L264: x => {
L265: let _ = self.consume(x);
L266: }
L267: }
L268: }
L269: counter += 1; // Indicate that we called next() one time after the final `Some`.
L270: Some(counter)
L271: }
L272: _ => unreachable!("Invalid position"),
L273: }
L274: }
L275:
L276: fn should_close_string(
L277: &mut self,
L278: mut next: Peekable<impl Iterator<Item = (usize, char)>>,
L279: closing_char: char,
L280: ) -> bool {
L281: let (has_some_object, in_object_key, in_object_value, in_array) =
L282: if self.collection_stack.len() >= 2 {
L283: self.collection_stack
L284: .get(self.collection_stack.len() - 2)
L285: .map(|(c, _)| match c {
L286: JsonCollection::Object(keys, values) => {
L287: if keys.len() == values.len() {
L288: (true, false, false)
L289: } else {
L290: (false, true, true)
L291: }
L292: }
L293: JsonCollection::Array(_) => (false, false, true),
L294: _ => (false, false, false),
L295: })
L296: .map(|(a, b, c)| (true, a, b, c))
L297: .unwrap()
L298: } else {
L299: (false, false, false, false)
L300: };
L301:
L302: if let Some((idx, next_char)) = next.peek() {
L303: let _idx = *idx;
L304: match next_char {
L305: ':' | '}' if in_object_key => {
L306: // We're ready to close the key
L307: log::debug!("Closing due to: key");
L308: true
L309: }
L310: ',' | '}' if in_object_value => {
L311: // We're ready to close the value
L312: log::debug!("Closing due to: value",);
L313: true
L314: }
L315: ',' | ']' if in_array => {
L316: // We're ready to close the value
L317: log::debug!("Closing due to: array");
L318: true
L319: }
L320: ' ' | '\t' | '\n' => {
L321: // look ahead and see if we can find a closing bracket or comma
L322: while let Some((_, c)) = next.next() {
L323: match c {
L324: ' ' | '\t' | '\n' => {}
L325: '}' if in_object_key || in_object_value => return true,
L326: ':' if in_object_key => return true,
L327: ',' if in_object_value => return true,
L328: ',' | ']' if in_array => return true,
L329: '/' => {
L330: // Could be a comment
L331: match next.peek() {
L332: Some((_, '/')) => {
L333: // We're ready to close the comment
L334: return true;
L335: }
L336: Some((_, '*')) => {
L337: // We're ready to close the comment
L338: return true;
L339: }
L340: _ => return false,
L341: }
L342: }
L343: _ => return false,
L344: }
L345: }
L346: // If we fail, terminate the string
L347: true
L348: }
L349: x if closing_char == *x => {
L350: // We'll close the string the next time around.
L351: false
L352: }
L353: '{' | '"' | '\'' | '[' => {
L354: if !has_some_object {
L355: // We're in a string
L356: true
L357: } else {
L358: false
L359: }
L360: }
L361: _ => {
L362: // Almost every other character should not close the string
L363: false
L364: }
L365: }
L366: } else {
L367: true
L368: }
L369: }
L370:
L371: pub fn process_token(
L372: &mut self,
L373: token: char,
L374: mut next: Peekable<impl Iterator<Item = (usize, char)>>,
L375: ) -> Result<usize> {
L376: // println!("Processing: {:?}..{:?}", token, next.peek());
L377: match self.collection_stack.last() {
L378: Some((last, _)) => match last {
L379: JsonCollection::Object(_, _) => {
L380: match token {
L381: '}' => {
L382: // We're ready to close the object
L383: self.complete_collection();
L384: Ok(0)
L385: }
L386: // We can safely ignore these tokens
L387: ',' | ':' => Ok(0),
L388: // look for a new key or value
L389: _ => self.find_any_starting_value(token, next),
L390: }
L391: }
L392: JsonCollection::Array(_) => {
L393: // We could be expecting:
L394: // - A value
L395: // - a comma
L396: // - a closing bracket
L397: match token {
L398: ']' => {
L399: // We're ready to close the array
L400: self.complete_collection();
L401: Ok(0)
L402: }
L403: // Skip these tokens
L404: ',' => Ok(0),
L405: _ => self.find_any_starting_value(token, next),
L406: }
L407: }
L408: JsonCollection::TripleQuotedString(_) => {
L409: // We should be expecting:
L410: if token == '"' {
L411: // TODO: this logic is busted. peekable.peek() does not
L412: // advance the iterator (this is easily verified with
L413: // a unit test), but to fix this we need to do a bit of
L414: // refactoring, so for now we'll live with it.
L415: let is_triple_quoted = match next.peek() {
L416: Some((_, '"')) => matches!(next.peek(), Some((_, '"')) | None),
L417: None => true,
L418: _ => false,
L419: };
L420:
L421: if is_triple_quoted {
L422: self.complete_collection();
L423: Ok(3)
L424: } else {
L425: self.consume(token)
L426: }
L427: } else {
L428: self.consume(token)
L429: }
L430: }
L431: JsonCollection::QuotedString(_) => {
L432: // We could be expecting:
L433: // - A closing quote
L434: // - A character
L435: match token {
L436: '"' => {
L437: // It's possible that the LLM messed up the escaping
L438: // We'll try to fix it.
L439: if self.should_close_string(next, '"') {
L440: self.complete_collection();
L441: Ok(0)
L442: } else {
L443: self.consume(token)
L444: }
L445: }
L446: '\\' => {
L447: // Capture escaped characters
L448: match next.peek() {
L449: Some((_, 'n')) => {
L450: self.consume('\n')?;
L451: Ok(1)
L452: }
L453: Some((_, 't')) => {
L454: self.consume('\t')?;
L455: Ok(1)
L456: }
L457: Some((_, 'r')) => {
L458: self.consume('\r')?;
L459: Ok(1)
L460: }
L461: Some((_, 'b')) => {
L462: self.consume('\x08')?;
L463: Ok(1)
L464: }
L465: Some((_, 'f')) => {
L466: self.consume('\x0C')?;
L467: Ok(1)
L468: }
L469: Some((_, '\\')) => {
L470: self.consume('\\')?;
L471: Ok(1)
L472: }
L473: Some((_, '"')) => {
L474: self.consume('"')?;
L475: Ok(1)
L476: }
L477: Some((_, 'u')) => {
L478: // We'll consume the 'u' and the next 4 characters
L479: let mut buffer = String::new();
L480: buffer.push(token);
L481: for _ in 0..4 {
L482: if let Some((_, c)) = next.next() {
L483: buffer.push(c);
L484: } else {
L485: break;
L486: }
L487: }
L488: for c in buffer.chars() {
L489: let _ = self.consume(c);
L490: }
L491: Ok(5)
L492: }
L493: _ => self.consume(token),
L494: }
L495: }
L496: _ => self.consume(token),
L497: }
L498: }
L499: JsonCollection::TripleBacktickString { .. } => {
L500: // We could be expecting:
L501: // - A closing backtick
L502: // - A character
L503: if token == '`' {
L504: // TODO: this logic is busted. peekable.peek() does not
L505: // advance the iterator (this is easily verified with
L506: // a unit test), but to fix this we need to do a bit of
L507: // refactoring, so for now we'll live with it.
L508: let is_triple_quoted = match next.peek() {
L509: Some((_, '`')) => matches!(next.peek(), Some((_, '`')) | None),
L510: None => true,
L511: _ => false,
L512: };
L513:
L514: if is_triple_quoted {
L515: self.complete_collection();
L516: Ok(3)
L517: } else {
L518: self.consume(token)
L519: }
L520: } else {
L521: self.consume(token)
L522: }
L523: }
L524: JsonCollection::BacktickString(_) => {
L525: // We could be expecting:
L526: // - A closing backtick
L527: // - A character
L528: match token {
L529: '`' => {
L530: if self.should_close_string(next, '`') {
L531: self.complete_collection();
L532: Ok(0)
L533: } else {
L534: self.consume(token)
L535: }
L536: }
L537: _ => self.consume(token),
L538: }
L539: }
L540: JsonCollection::SingleQuotedString(_) => {
L541: // We could be expecting:
L542: // - A closing quote
L543: // - A character
L544: // - A space
L545: match token {
L546: '\'' => {
L547: // It's possible that the LLM messed up the escaping
L548: // We'll try to fix it.
L549: if self.should_close_string(next, '\'') {
L550: self.complete_collection();
L551: Ok(0)
L552: } else {
L553: self.consume(token)
L554: }
L555: }
L556: _ => self.consume(token),
L557: }
L558: }
L559: JsonCollection::UnquotedString(_) => {
L560: // We could be expecting:
L561: // - A terminating json character (comma, colon, bracket, space, newline)
L562: // - A character
L563: let res = self.consume(token);
L564: if let Some(count) = self.should_close_unescaped_string(next) {
L565: self.complete_collection();
L566: Ok(count)
L567: } else {
L568: res
L569: }
L570: }
L571: JsonCollection::TrailingComment(_) => {
L572: // We could be expecting:
L573: // - A newline
L574: // - A character
L575: match token {
L576: '\n' => {
L577: // We're ready to close the comment
L578: self.complete_collection();
L579: Ok(0)
L580: }
L581: _ => self.consume(token),
L582: }
L583: }
L584: JsonCollection::BlockComment(_) => {
L585: // We could be expecting:
L586: // - A closing comment
L587: // - A character
L588: match token {
L589: '*' => {
L590: // We could be closing the comment
L591: match next.peek() {
L592: Some((_, '/')) => {
L593: // We're ready to close the comment
L594: self.complete_collection();
L595: Ok(1)
L596: }
L597: _ => Ok(0),
L598: }
L599: }
L600: _ => self.consume(token),
L601: }
L602: }
L603: },
L604: None => {
L605: // We could be expecting:
L606: // - A value
L607: // - Any leading whitespace
L608: let preview = next.peekable();
L609: self.find_any_starting_value(token, preview)
L610: }
L611: }
L612: }
L613:
L614: // Returns the number of increments to skip after processing the token
L615: fn find_any_starting_value(
L616: &mut self,
L617: token: char,
L618: mut next: Peekable<impl Iterator<Item = (usize, char)>>,
L619: ) -> Result<usize> {
L620: match token {
L621: '{' => {
L622: self.collection_stack
L623: .push((JsonCollection::Object(vec![], vec![]), Default::default()));
L624: }
L625: '[' => {
L626: self.collection_stack
L627: .push((JsonCollection::Array(vec![]), Default::default()));
L628: }
L629: '"' => {
L630: // Peek if next 2 characters are also quotes
L631: let is_triple_quoted = {
L632: next.next_if(|&(_, c)| c == '"')
L633: .and_then(|_| next.next_if(|&(_, c)| c == '"'))
L634: .is_some()
L635: };
L636:
L637: if is_triple_quoted {
L638: self.collection_stack.push((
L639: JsonCollection::TripleQuotedString(String::new()),
L640: Default::default(),
L641: ));
L642: return Ok(2);
L643: } else {
L644: self.collection_stack.push((
L645: JsonCollection::QuotedString(String::new()),
L646: Default::default(),
L647: ))
L648: }
L649: }
L650: '\'' => {
L651: self.collection_stack.push((
L652: JsonCollection::SingleQuotedString(String::new()),
L653: Default::default(),
L654: ));
L655: }
L656: '`' => {
L657: // Peek if next 2 characters are also quotes
L658: let is_triple_quoted = {
L659: next.next_if(|&(_, c)| c == '`')
L660: .and_then(|_| next.next_if(|&(_, c)| c == '`'))
L661: .is_some()
L662: };
L663:
L664: if is_triple_quoted {
L665: self.collection_stack.push((
L666: JsonCollection::TripleBacktickString {
L667: lang: None,
L668: path: None,
L669: content: String::new(),
L670: },
L671: Default::default(),
L672: ));
L673: return Ok(2);
L674: } else {
L675: self.collection_stack.push((
L676: JsonCollection::BacktickString(String::new()),
L677: Default::default(),
L678: ))
L679: }
L680: }
L681: '/' => {
L682: // Could be a comment
L683: match next.peek() {
L684: Some((_, '/')) => {
L685: self.collection_stack.push((
L686: JsonCollection::TrailingComment(String::new()),
L687: Default::default(),
L688: ));
L689: return Ok(1);
L690: }
L691: Some((_, '*')) => {
L692: self.collection_stack.push((
L693: JsonCollection::BlockComment(String::new()),
L694: Default::default(),
L695: ));
L696: return Ok(1);
L697: }
L698: _ => {
L699: // if we're in an object, this could be the beginning of a string
L700: // say a path?
L701: if matches!(
L702: self.collection_stack.last(),
L703: Some((JsonCollection::Object(_, _), _))
L704: ) {
L705: self.collection_stack.push((
L706: JsonCollection::UnquotedString(token.into()),
L707: Default::default(),
L708: ));
L709: return Ok(0);
L710: }
L711: }
L712: }
L713: }
L714: x if x.is_whitespace() => {}
L715: x => {
L716: self.collection_stack
L717: .push((JsonCollection::UnquotedString(x.into()), Default::default()));
L718: if let Some(count) = self.should_close_unescaped_string(next) {
L719: self.complete_collection();
L720: return Ok(count);
L721: }
L722: }
L723: };
L724:
L725: Ok(0)
L726: }
L727: }
L728:
</src/jsonish/parser/fixing_parser/json_parse_state.rs>
cargo test fails:
Look at the left and right output: left is the actual value, right is the expected value.
---- jsonish::tests::test_to_json_string_compact stdout ----
thread 'jsonish::tests::test_to_json_string_compact' panicked at src/jsonish/mod.rs:66:9:
assertion `left == right` failed
left: "{\"naem\":\"Alice\\\" age:30, extra:\\\"remove me\",\"yap\":\" noisy message \"}"
right: "{\"age\":30,\"naem\":\"Alice\", extra:\"remove me\",\"yap\":\" noisy message \"}"
stack backtrace:
0: rust_begin_unwind
-----
Change the code to ensure that the test passes.